#include <crt_utf8.h>

/* Implementation of RFC 3629, "UTF-8, a transformation format of ISO 10646"
 * with particular attention to canonical translation forms (see section 10
 * "Security Considerations" of the RFC for more info).
 *
 * Since several architectures including Windows support unicode, with UCS2
 * used as the actual storage convention by that architecture, these functions
 * exist to transform or validate UCS2 strings into APR's 'char' type
 * convention.  It is left up to the operating system to determine the
 * validity of the string, e.g. normative forms, in the context of
 * its native language support.  Other file systems which support filename
 * characters of 0x80-0xff but have no explicit requirement for Unicode
 * will find this function useful only for validating the character sequences
 * and rejecting poorly encoded UTF8 sequences.
 *
 * Len UCS-4 range (hex) UTF-8 octet sequence (binary)
 * 1:2 00000000-0000007F 0xxxxxxx
 * 2:2 00000080-000007FF 110XXXXx 10xxxxxx
 * 3:2 00000800-0000FFFF 1110XXXX 10Xxxxxx 10xxxxxx
 * 4:4 00010000-001FFFFF 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
 *     00200000-03FFFFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *     04000000-7FFFFFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * One of the X bits must be 1 to avoid overlong representation of ucs2 values.
 *
 * For conversion into ucs2, the 4th form is limited in range to 0010 FFFF,
 * and the final two forms are used only by full ucs4, per RFC 3629;
 *
 *   "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in
 *   Unicode parlance), being actually UCS-4 characters transformed
 *   through UTF-16, need special treatment: the UTF-16 transformation
 *   must be undone, yielding a UCS-4 character that is then transformed
 *   as above."
 *
 * From RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
 *
 *  U' = U - 0x10000
 *  U' = 00000000 0000yyyy yyyyyyxx xxxxxxxx
 *                    W1 = 110110yy yyyyyyyy
 *                    W2 = 110111xx xxxxxxxx
 *  Max U' = 0000 00001111 11111111 11111111
 *  Max U  = 0000 00010000 11111111 11111111
 *
 * Len in the table above is a mapping of bytes used for utf8:ucs2 values,
 * which results in these conclusions of maximum allocations;
 *
 *  utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
 *  ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
 */


int is_utf8(const char* str, size_t length) {
	size_t i;
	int nBytes;
	unsigned char chr;

	i = 0;
	nBytes = 0;
	while (i < length) {
		chr = *(str + i);

		if (nBytes == 0) { 
			if ((chr & 0x80) != 0) {
				while ((chr & 0x80) != 0) {
					chr <<= 1;
					nBytes++;
				}
				if ((nBytes < 2) || (nBytes > 6)) {
					return 0; 
				}
				nBytes--;
			}
		}
		else {  
			if ((chr & 0xC0) != 0x80) {
				return 0; 
			}
			nBytes--;
		}
		i++;
	}
	return (nBytes == 0) ? S_SUCCESS : S_ERROR;
}


int utf8_to_ucs2(const char *in,
	size_t *inbytes,
	wchar_t *out,
	size_t *outwords)
{
	int64 newch, mask;
	size_t expect, eating;
	int ch;

	while (*inbytes && *outwords)
	{
		ch = (unsigned char)(*in++);
		if (!(ch & 0200)) {
			/* US-ASCII-7 plain text
			 */
			--*inbytes;
			--*outwords;
			*(out++) = ch;
		}
		else
		{
			if ((ch & 0300) != 0300) {
				/* Multibyte Continuation is out of place
				 */
				return S_ERROR;
			}
			else
			{
				/* Multibyte Sequence Lead Character
				 *
				 * Compute the expected bytes while adjusting
				 * or lead byte and leading zeros mask.
				 */
				mask = 0340;
				expect = 1;
				while ((ch & mask) == mask) {
					mask |= mask >> 1;
					if (++expect > 3) /* (truly 5 for ucs-4) */
						return S_ERROR;
				}
				newch = ch & ~mask;
				eating = expect + 1;
				if (*inbytes <= expect)
					return S_ERROR;
				/* Reject values of excessive leading 0 bits
				 * utf-8 _demands_ the shortest possible byte length
				 */
				if (expect == 1) {
					if (!(newch & 0036))
						return S_ERROR;
				}
				else {
					/* Reject values of excessive leading 0 bits
					 */
					if (!newch && !((unsigned char)*in & 0077 & (mask << 1)))
						return S_ERROR;
					if (expect == 2) {
						/* Reject values D800-DFFF when not utf16 encoded
						 * (may not be an appropriate restriction for ucs-4)
						 */
						if (newch == 0015 && ((unsigned char)*in & 0040))
							return S_ERROR;
					}
					else if (expect == 3) {
						/* Short circuit values > 110000
						 */
						if (newch > 4)
							return S_ERROR;
						if (newch == 4 && ((unsigned char)*in & 0060))
							return S_ERROR;
					}
				}
				/* Where the boolean (expect > 2) is true, we will need
				 * an extra word for the output.
				 */
				if (*outwords < (size_t)(expect > 2) + 1)
					break; /* buffer full */
				while (expect--)
				{
					/* Multibyte Continuation must be legal */
					if (((ch = (unsigned char)*(in++)) & 0300) != 0200)
						return S_ERROR;
					newch <<= 6;
					newch |= (ch & 0077);
				}
				*inbytes -= eating;
				/* newch is now a true ucs-4 character
				 *
				 * now we need to fold to ucs-2
				 */
				if (newch < 0x10000)
				{
					--*outwords;
					*(out++) = (wchar_t)newch;
				}
				else
				{
					*outwords -= 2;
					newch -= 0x10000;
					*(out++) = (wchar_t)(0xD800 | (newch >> 10));
					*(out++) = (wchar_t)(0xDC00 | (newch & 0x03FF));
				}
			}
		}
	}
	/* Buffer full 'errors' aren't errors, the client must inspect both
	 * the inbytes and outwords values
	 */
	return S_SUCCESS;
}

int ucs2_to_utf8(const wchar_t *in,
	size_t *inwords,
	char *out,
	size_t *outbytes)
{
	int64 newch, require;
	size_t need;
	char *invout;
	int ch;

	while (*inwords && *outbytes)
	{
		ch = (unsigned short)(*in++);
		if (ch < 0x80)
		{
			--*inwords;
			--*outbytes;
			*(out++) = (unsigned char)ch;
		}
		else
		{
			if ((ch & 0xFC00) == 0xDC00) {
				/* Invalid Leading ucs-2 Multiword Continuation Character
				 */
				return S_ERROR;
			}
			if ((ch & 0xFC00) == 0xD800) {
				/* Leading ucs-2 Multiword Character
				 */
				if (*inwords < 2) {
					/* Missing ucs-2 Multiword Continuation Character
					 */
					return S_ERROR;
				}
				if (((unsigned short)(*in) & 0xFC00) != 0xDC00) {
					/* Invalid ucs-2 Multiword Continuation Character
					 */
					return S_ERROR;
				}
				newch = (ch & 0x03FF) << 10 | ((unsigned short)(*in++) & 0x03FF);
				newch += 0x10000;
			}
			else {
				/* ucs-2 Single Word Character
				 */
				newch = ch;
			}
			/* Determine the absolute minimum utf-8 bytes required
			 */
			require = newch >> 11;
			need = 1;
			while (require)
				require >>= 5, ++need;
			if (need >= *outbytes)
				break; /* Insufficient buffer */
			*inwords -= (need > 2) + 1;
			*outbytes -= need + 1;
			/* Compute the utf-8 characters in last to first order,
			 * calculating the lead character length bits along the way.
			 */
			ch = 0200;
			out += need + 1;
			invout = out;
			while (need--) {
				ch |= ch >> 1;
				*(--invout) = (unsigned char)(0200 | (newch & 0077));
				newch >>= 6;
			}
			/* Compute the lead utf-8 character and move the dest offset
			 */
			*(--invout) = (unsigned char)(ch | newch);
		}
	}
	/* Buffer full 'errors' aren't errors, the client must inspect both
	 * the inwords and outbytes values
	 */
	return S_SUCCESS;
}


char* ucs2_to_char(const wchar_t *in, size_t len)
{
	char* ptr = NULL;
	size_t size = wcstombs(ptr, in, 0);

	ptr = (char*)calloc(size + 1, sizeof(char));
	rc_assert(ptr != NULL, NULL)

	wcstombs(ptr, in, size);


	return ptr;
}

wchar_t* char_to_ucs2(const char *in, size_t len)
{
	
	wchar_t* ptr = NULL;
    size_t size = mbstowcs(ptr, in, len);

	ptr = (wchar_t*)calloc(size + 1, sizeof(wchar_t));
	rc_assert(ptr != NULL, NULL)

	mbstowcs(ptr, in, size);

	return ptr;
}



